libraries

library(Seurat)
library(Matrix)
library(dplyr)
library(tidyr)
library(purrr)
library(ggplot2)
library(tibble)
library(cowplot)  
library(patchwork)
library(stringr)
library(pheatmap)
library(reshape2)
library(readxl)
library(stringr)
library(DESeq2)
library(gridExtra)
library(ggrepel)
library(ggpubr)
library(tidyverse)
library(viridisLite)

theme_set(theme_classic())


data_dir = "/home/zli4/MSK_DEC24/Analysis"

out_dir = "/home/zli4/MSK_DEC24/Analysis/outs/differential_expression"

tp = "D3"
print(tp)
## [1] "D3"

load pseudobulk samples

f_pseudobulk_list = file.path(out_dir,"pseudobulk_list.RDS")
pseudo_obj_list  = readRDS(f_pseudobulk_list)

remove pseudobulk samples with < 30 cells

min_cells = 30


pseudo_obj_list = lapply(pseudo_obj_list,function(pseudo_obj){
  pseudo_obj%>%subset(subset = n>=min_cells)
})

Differential Analysis Conditioned on Cell Type

geno2test_tp = readRDS( file.path(out_dir,"geno2test_tp.RDS"))
geno2test = geno2test_tp[[tp]]
pseudo_obj = pseudo_obj_list[[tp]]
res_all = NULL

for(ct in unique(geno2test$celltype)){

  print("==========")
  print(paste0("working on ", ct))
  print("==========")
  geno2use = geno2test%>%filter(celltype==ct)%>%pull(genotype)
  geno2use = as.character(geno2use)
  
  stopifnot(length(geno2use) > 0)
  
  dat = pseudo_obj%>%subset(celltype ==ct)%>%subset(genotype%in%(union(geno2use,"WT")))
  dat_mat = GetAssayData(dat, layer = 'counts')
  dat_mat <- dat_mat[!grepl("^ENSG", rownames(dat_mat)), ] # remove genes without gene symbols
  n_non_zero = rowSums(dat_mat > 0)
  dat_mat = dat_mat[n_non_zero >= ncol(dat_mat) * 0.5, ]
  
  meta_df = dat[[]]
  if(!"WT"%in%unique(meta_df$genotype)){
    next
  }
    
  meta_df$genotype = factor(meta_df$genotype)
  meta_df$genotype = relevel(meta_df$genotype, ref="WT")
  meta_df$log10_rd = log10(colSums(dat_mat))
  summary(meta_df$log10_rd)
  
    
  if(nrow(meta_df)<=5){# check number of samples
    next
  } 
  
  # create dds object
  if(length(unique(meta_df$source))>1){ # pseudobulk samples come from two sources (both external_WT and KO_village)
    if(all(rowSums(table(meta_df$genotype,meta_df$source)!=0)==1)){ 
      # when all WT samples come from one data source while KO samples the other
      next
    }
    if(length(unique(meta_df$background))==1){
      print("===== all pseudobulk samples come from one background ===== ")
      # all pseudobulk samples come from one background
        dds = DESeqDataSetFromMatrix(dat_mat, colData=meta_df, 
                      ~ genotype + log10_rd + source)
    }else{
        dds = DESeqDataSetFromMatrix(dat_mat, colData=meta_df, 
                      ~ genotype + log10_rd + background + source)
    }
  }else{  # all pseudobulk samples come from one source
    print("===== all pseudobulk samples come from one source===== ")
    if(length(unique(meta_df$background))==1){
      print("===== all pseudobulk samples come from one background ===== ")
      dds = DESeqDataSetFromMatrix(dat_mat, colData=meta_df, 
                              ~ genotype + log10_rd )
    }else{
      dds = DESeqDataSetFromMatrix(dat_mat, colData=meta_df, 
                              ~ genotype + log10_rd + background)
    }
  }

  vsd = vst(dds, blind=FALSE)
  
  # run PCA
  pcs = plotPCA(vsd, intgroup = c("genotype", "background", "source"), returnData = TRUE)
  genotypes2plot = setdiff(pcs$genotype, "WT")
  
  # PCA plot: color by genotype, shape by cell background
  plot_lists = list()
  for (geno1 in genotypes2plot) {
    pcs$highlight <- ifelse(
      pcs$genotype == "WT", "WT",
      ifelse(pcs$genotype == geno1, "KO", "Other")
    )
    p1 = ggplot(pcs, aes(x = PC1, y = PC2, color = highlight, 
                              alpha = highlight, shape = background)) +
      geom_point(size = 2) +
      scale_color_manual(values = c("WT" = "blue", "KO" = "red", 
                                    "Other" = "grey")) +
      scale_alpha_manual(values = c("WT" = 1, "KO" = 1, "Other" = 0.5)) +
      labs(title = paste(ct, geno1))  + 
      guides(
        color = "none", 
        alpha = "none",
        shape = guide_legend(title = "") 
      ) 
    plot_lists[[geno1]] = p1
  }
  i = 1
  while (i <= length(plot_lists)) {
    if (i < length(plot_lists)) {
      grid.arrange(plot_lists[[i]], plot_lists[[i + 1]], ncol = 2)
    } else {
      grid.arrange(plot_lists[[i]], ncol = 2)
    }
    i <- i + 2
    Sys.sleep(1) 
  }
  
  # PCA plot: color by genotype, shape by source
  plot_lists = list()
  for (geno1 in genotypes2plot) {
    pcs$highlight <- ifelse(
      pcs$genotype == "WT", "WT",
      ifelse(pcs$genotype == geno1, "KO", "Other")
    )
    p1 = ggplot(pcs, aes(x = PC1, y = PC2, color = highlight, 
                              alpha = highlight, shape = source)) +
      geom_point(size = 2) +
      scale_color_manual(values = c("WT" = "blue", "KO" = "red", 
                                    "Other" = "grey")) +
      scale_alpha_manual(values = c("WT" = 1, "KO" = 1, "Other" = 0.5)) +
      labs(title = paste(ct, geno1))  + 
      guides(
        color = "none", 
        alpha = "none",
        shape = guide_legend(title = "") 
      ) 
    plot_lists[[geno1]] = p1
  }
  i = 1
  while (i <= length(plot_lists)) {
    if (i < length(plot_lists)) {
      grid.arrange(plot_lists[[i]], plot_lists[[i + 1]], ncol = 2)
    } else {
      grid.arrange(plot_lists[[i]], ncol = 2)
    }
    i <- i + 2
    Sys.sleep(1) 
  }
  
  dds = DESeq(dds)
  
  # check cell background effect
  if(length(unique(meta_df$background))>1){
      res_k = results(dds, contrast = c("background", "HUES8", "H1"))
      res_k = as.data.frame(res_k)
      g1 = ggplot(res_k, aes(x=pvalue)) + 
        geom_histogram(color="darkblue", fill="lightblue", 
                        breaks = seq(0,1,by=0.01)) + 
        ggtitle("genetic background") + 
        guides(fill=guide_legend(ncol=2))
      w_de = which(res_k$padj < 0.05)
      res_k$delabel = NA
      res_k$delabel[w_de] = rownames(res_k)[w_de]
      res_k$diffexpressed = "No"
      res_k$diffexpressed[res_k$log2FoldChange > 0 & res_k$padj < 0.05] <- "Up"
      res_k$diffexpressed[res_k$log2FoldChange < 0 & res_k$padj < 0.05] <- "Down"
      res_k$log2FoldChange[which(res_k$log2FoldChange > 3)] = 3
      res_k$log2FoldChange[which(res_k$log2FoldChange < -3)] = -3
      col2use = c("blue", "grey", "red")
      names(col2use) = c("Down", "No", "Up")
      g2 = ggplot(res_k, aes(x=log2FoldChange, y=-log10(padj), 
                      col=diffexpressed, label=delabel)) + 
        geom_point() + 
        geom_text_repel(show.legend = FALSE) + 
        scale_color_manual(values=col2use)
      print(ggarrange(g1, g2, widths = c(2,3)))
      saveRDS(res_k, 
          file=file.path(out_dir, paste0(ct,"_DE_pseudobulk_background_effect.RDS",sep = "")))
  }
  
  if(length(unique(meta_df$source))>1){
    # check data source effect
    res_k = results(dds, contrast = c("source",  "external_WT", "KO_village" ))
    res_k = as.data.frame(res_k)
    g1 = ggplot(res_k, aes(x=pvalue)) + 
      geom_histogram(color="darkblue", fill="lightblue", 
                      breaks = seq(0,1,by=0.01)) + 
      ggtitle("data source") + 
      guides(fill=guide_legend(ncol=2))
    w_de = which(res_k$padj < 0.05)
    res_k$delabel = NA
    res_k$delabel[w_de] = rownames(res_k)[w_de]
    res_k$diffexpressed = "No"
    res_k$diffexpressed[res_k$log2FoldChange > 0 & res_k$padj < 0.05] <- "Up"
    res_k$diffexpressed[res_k$log2FoldChange < 0 & res_k$padj < 0.05] <- "Down"
    res_k$log2FoldChange[which(res_k$log2FoldChange > 3)] = 3
    res_k$log2FoldChange[which(res_k$log2FoldChange < -3)] = -3
    col2use = c("blue", "grey", "red")
    names(col2use) = c("Down", "No", "Up")
    g2 = ggplot(res_k, aes(x=log2FoldChange, y=-log10(padj), 
                    col=diffexpressed, label=delabel)) + 
      geom_point() + 
      geom_text_repel(show.legend = FALSE) + 
      scale_color_manual(values=col2use)
    print(ggarrange(g1, g2, widths = c(2,3)))
    saveRDS(res_k, 
        file=file.path(out_dir, paste0(ct,"_DE_pseudobulk_source_effect.RDS",sep = "")))
  }
  # KO vs WT DE analysis conditioned on the given cell type
  for(k in 1:length(geno2use)){
    nm_k = paste(ct, geno2use[k])
    res_k = results(dds, contrast = c("genotype", geno2use[k], "WT"))
    res_k = as.data.frame(res_k)
    table(is.na(res_k$pvalue))
  
    g1 = ggplot(res_k, aes(x=pvalue)) + 
      geom_histogram(color="darkblue", fill="lightblue", 
                      breaks = seq(0,1,by=0.01)) + 
      ggtitle(nm_k) + 
      guides(fill=guide_legend(ncol=2))
    
    w_de = which(res_k$padj < 0.05)
    res_k$delabel = NA
    res_k$delabel[w_de] = rownames(res_k)[w_de]
    res_k$diffexpressed = "No"
    res_k$diffexpressed[res_k$log2FoldChange > 0 & res_k$padj < 0.05] <- "Up"
    res_k$diffexpressed[res_k$log2FoldChange < 0 & res_k$padj < 0.05] <- "Down"
    res_k$log2FoldChange[which(res_k$log2FoldChange > 3)] = 3
    res_k$log2FoldChange[which(res_k$log2FoldChange < -3)] = -3
  
    col2use = c("blue", "grey", "red")
    names(col2use) = c("Down", "No", "Up")
    
    g2 = ggplot(res_k, aes(x=log2FoldChange, y=-log10(padj), 
                    col=diffexpressed, label=delabel)) + 
      geom_point() + 
      geom_text_repel(show.legend = FALSE) + 
      scale_color_manual(values=col2use) + 
      ggtitle(nm_k)
    
    print(ggarrange(g1, g2, widths = c(2,3)))
    
    res_k$celltype = ct
    res_k$genotype = geno2use[k]
    res_k$gene = rownames(res_k)
    rownames(res_k) = NULL
    
    res_all = rbind(res_all, res_k)
  }
}
## [1] "=========="
## [1] "working on DE"
## [1] "=========="

## [1] "=========="
## [1] "working on ESC"
## [1] "=========="
## [1] "=========="
## [1] "working on ESC_DE"
## [1] "=========="
## [1] "===== all pseudobulk samples come from one source===== "

saveRDS(res_all, 
        file=file.path(out_dir,paste0( "DE_pseudobulk_",tp,".RDS")))

Session information

gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  8195883 437.8   14714865  785.9  14714865  785.9
## Vcells 91900343 701.2  251290149 1917.2 314112685 2396.5
sessionInfo()
## R version 4.4.0 (2024-04-24)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 18.04.6 LTS
## 
## Matrix products: default
## BLAS/LAPACK: FlexiBLAS OPENBLAS;  LAPACK version 3.11.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: America/Los_Angeles
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] viridisLite_0.4.2           lubridate_1.9.3            
##  [3] forcats_1.0.0               readr_2.1.5                
##  [5] tidyverse_2.0.0             ggpubr_0.6.0               
##  [7] ggrepel_0.9.5               gridExtra_2.3              
##  [9] DESeq2_1.44.0               SummarizedExperiment_1.34.0
## [11] Biobase_2.64.0              MatrixGenerics_1.16.0      
## [13] matrixStats_1.3.0           GenomicRanges_1.56.0       
## [15] GenomeInfoDb_1.40.0         IRanges_2.38.0             
## [17] S4Vectors_0.42.0            BiocGenerics_0.50.0        
## [19] readxl_1.4.3                reshape2_1.4.4             
## [21] pheatmap_1.0.12             stringr_1.5.1              
## [23] patchwork_1.2.0             cowplot_1.1.3              
## [25] tibble_3.2.1                ggplot2_3.5.1              
## [27] purrr_1.0.2                 tidyr_1.3.1                
## [29] dplyr_1.1.4                 Matrix_1.7-0               
## [31] Seurat_5.1.0                SeuratObject_5.0.2         
## [33] sp_2.1-4                   
## 
## loaded via a namespace (and not attached):
##   [1] RcppAnnoy_0.0.22        splines_4.4.0           later_1.3.2            
##   [4] cellranger_1.1.0        polyclip_1.10-6         fastDummies_1.7.3      
##   [7] lifecycle_1.0.4         rstatix_0.7.2           globals_0.16.3         
##  [10] lattice_0.22-6          MASS_7.3-60.2           backports_1.4.1        
##  [13] magrittr_2.0.3          plotly_4.10.4           sass_0.4.9             
##  [16] rmarkdown_2.26          jquerylib_0.1.4         yaml_2.3.8             
##  [19] httpuv_1.6.15           sctransform_0.4.1       spam_2.10-0            
##  [22] spatstat.sparse_3.0-3   reticulate_1.36.1       pbapply_1.7-2          
##  [25] RColorBrewer_1.1-3      abind_1.4-5             zlibbioc_1.50.0        
##  [28] Rtsne_0.17              GenomeInfoDbData_1.2.12 irlba_2.3.5.1          
##  [31] listenv_0.9.1           spatstat.utils_3.0-4    goftest_1.2-3          
##  [34] RSpectra_0.16-1         spatstat.random_3.2-3   fitdistrplus_1.2-1     
##  [37] parallelly_1.37.1       leiden_0.4.3.1          codetools_0.2-20       
##  [40] DelayedArray_0.30.1     tidyselect_1.2.1        UCSC.utils_1.0.0       
##  [43] farver_2.1.2            spatstat.explore_3.2-7  jsonlite_1.8.8         
##  [46] progressr_0.14.0        ggridges_0.5.6          survival_3.6-4         
##  [49] tools_4.4.0             ica_1.0-3               Rcpp_1.0.12            
##  [52] glue_1.7.0              SparseArray_1.4.3       xfun_0.44              
##  [55] withr_3.0.0             fastmap_1.2.0           fansi_1.0.6            
##  [58] digest_0.6.35           timechange_0.3.0        R6_2.5.1               
##  [61] mime_0.12               colorspace_2.1-0        scattermore_1.2        
##  [64] tensor_1.5              spatstat.data_3.0-4     utf8_1.2.4             
##  [67] generics_0.1.3          data.table_1.15.4       httr_1.4.7             
##  [70] htmlwidgets_1.6.4       S4Arrays_1.4.0          uwot_0.2.2             
##  [73] pkgconfig_2.0.3         gtable_0.3.5            lmtest_0.9-40          
##  [76] XVector_0.44.0          htmltools_0.5.8.1       carData_3.0-5          
##  [79] dotCall64_1.1-1         scales_1.3.0            png_0.1-8              
##  [82] knitr_1.46              rstudioapi_0.16.0       tzdb_0.4.0             
##  [85] nlme_3.1-164            cachem_1.0.8            zoo_1.8-12             
##  [88] KernSmooth_2.23-22      parallel_4.4.0          miniUI_0.1.1.1         
##  [91] pillar_1.9.0            grid_4.4.0              vctrs_0.6.5            
##  [94] RANN_2.6.1              promises_1.3.0          car_3.1-2              
##  [97] xtable_1.8-4            cluster_2.1.6           evaluate_0.23          
## [100] cli_3.6.2               locfit_1.5-9.9          compiler_4.4.0         
## [103] rlang_1.1.3             crayon_1.5.2            future.apply_1.11.2    
## [106] ggsignif_0.6.4          labeling_0.4.3          plyr_1.8.9             
## [109] stringi_1.8.4           deldir_2.0-4            BiocParallel_1.38.0    
## [112] munsell_0.5.1           lazyeval_0.2.2          spatstat.geom_3.2-9    
## [115] RcppHNSW_0.6.0          hms_1.1.3               future_1.33.2          
## [118] shiny_1.8.1.1           highr_0.10              ROCR_1.0-11            
## [121] igraph_2.0.3            broom_1.0.5             bslib_0.7.0